library(ggplot2)
library(ggthemes)
library(wordcloud)
library(data.table)
library(tm)
library(SnowballC)

# Load the tweets. Base readRDS() is used because read_rds() belongs to readr,
# which this script does not attach.
tw_es <- readRDS("coronavirus_es.RDS")

# Bag of words: ------------------------------------------------------------

# Text-cleaning helpers. Each takes a character vector and returns a character
# vector; they are wrapped in content_transformer() below so that tm_map()
# keeps every document a PlainTextDocument.

# Remove URLs: drop everything from "http" up to the next whitespace, so the
# "://host/path" tail is removed together with the scheme.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)

# Remove the retweet marker "rt" only when it is a standalone word; a bare
# "rt" pattern would also delete those letters inside words such as "parte".
removeRetweetTag <- function(x) gsub("\\brt\\b", "", x)

# Replace Spanish accented letters with their ASCII equivalents.
foldAccents <- function(x) chartr("áéíóúüñ", "aeiouun", x)

# Remove every remaining non-ASCII character (emoji and other symbols).
removeNonAscii <- function(x) gsub("[^\x01-\x7F]", "", x)

# Build the corpus from the tweet text. A VCorpus is created up front so the
# result is already in the format the rest of the analysis needs.
corpus <- VCorpus(VectorSource(tw_es$text))

# Preview the first document (skipped when there are no tweets).
if (length(corpus) > 0) {
  strwrap(as.character(corpus[[1]]))
}

# Convert all text to lower case.
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove URLs and the retweet marker while punctuation is still present, so
# that whole URLs can be recognised.
corpus <- tm_map(corpus, content_transformer(removeURL))
corpus <- tm_map(corpus, content_transformer(removeRetweetTag))

# Remove numbers.
corpus <- tm_map(corpus, removeNumbers)

# Remove punctuation.
corpus <- tm_map(corpus, removePunctuation)

# Remove Spanish stopwords (done before accents are folded, because the
# stopword list itself contains accented words).
corpus <- tm_map(corpus, removeWords, stopwords("spanish"))

# Reduce words to their stem. The language must be given explicitly, otherwise
# the English stemmer is used; it runs before accent folding so the Spanish
# stemmer still sees accented suffixes.
corpus <- tm_map(corpus, stemDocument, language = "spanish")

# Fold accents, then drop emoji and any other non-ASCII characters.
corpus <- tm_map(corpus, content_transformer(foldAccents))
corpus <- tm_map(corpus, content_transformer(removeNonAscii))

# Collapse the runs of whitespace left behind by the removals above. This is
# the last step so that no later transformation reintroduces gaps.
corpus <- tm_map(corpus, stripWhitespace)